# -*-coding:utf-8-*-
__author__ = 'Administrator'

from bs4 import BeautifulSoup
import urllib
import urllib2
import re
import MySQLdb
import cgi
import sys
import time
import chardet

def parsing_html(url, encoding='GBK'):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    :param url: absolute URL of the page to fetch.
    :param encoding: character encoding hint passed to BeautifulSoup as
        ``from_encoding``; defaults to ``'GBK'``, the value this function
        has always used.
    :returns: a ``BeautifulSoup`` object built with the ``html.parser`` backend.
    :raises urllib2.URLError: propagated from ``urllib2.urlopen`` when the
        request fails.
    """
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    try:
        content = response.read()
    finally:
        # Always release the underlying socket; this function is called once
        # per article, so leaked handles would pile up over a run.
        response.close()
    return BeautifulSoup(content, 'html.parser', from_encoding=encoding)

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# implicit str/unicode conversions below use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')

# NOTE: the variable name is historical; the URL points at the Sohu roll page.
sina = 'http://wei.sohu.com/roll/'

TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# Parameterized statements: MySQLdb escapes every value itself, so scraped
# titles/bodies containing quotes can neither break nor inject into the SQL.
# The ID column is omitted so that MySQL assigns it via AUTO_INCREMENT.
INSERT_POST_SQL = (
    "INSERT INTO wp_posts("
    "post_author, post_date, post_date_gmt, post_content, post_title, "
    "post_excerpt, post_status, comment_status, ping_status, post_password, "
    "post_name, to_ping, pinged, post_modified, post_modified_gmt, "
    "post_content_filtered, post_parent, guid, menu_order, post_type, "
    "post_mime_type, comment_count) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
    "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)
UPDATE_POST_SQL = "UPDATE wp_posts SET post_parent=%s, guid=%s WHERE id=%s"

soup = parsing_html(sina)

# Every anchor inside the index list is treated as a link to one article.
links = soup.select('.f14list a')

db = MySQLdb.connect("localhost", "root", "", "wordpress", charset="UTF8")
try:
    cursor = db.cursor()
    try:
        cursor.execute('SET NAMES utf8')
        for link in links:
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be fetched.
                continue

            soup_ = parsing_html(href)
            article = soup_.select('#contentText div')
            if not article:
                # Page layout differs from what the selector expects; skip
                # it instead of aborting the whole run with an IndexError.
                print('skipped (no article body found): ' + href)
                continue

            # The first matching <div> is stored, HTML-escaped, as the body.
            html = cgi.escape(str(article[0]))

            # link.string is None when the anchor wraps nested markup; fall
            # back to the concatenated text so the title is never "None".
            title = link.string
            if title is None:
                title = link.get_text()
            print(title)

            now = time.time()
            local_time = time.strftime(TIME_FORMAT, time.localtime(now))
            # The *_gmt columns must hold UTC, not the server's local time.
            gmt_time = time.strftime(TIME_FORMAT, time.gmtime(now))

            cursor.execute(INSERT_POST_SQL, (
                1,            # post_author
                local_time,   # post_date
                gmt_time,     # post_date_gmt
                html,         # post_content
                title,        # post_title
                '',           # post_excerpt
                'publish',    # post_status
                'open',       # comment_status
                'open',       # ping_status
                '',           # post_password
                title,        # post_name
                '',           # to_ping
                '',           # pinged
                local_time,   # post_modified
                gmt_time,     # post_modified_gmt
                '',           # post_content_filtered
                0,            # post_parent (overwritten by the UPDATE below)
                '',           # guid (overwritten by the UPDATE below)
                0,            # menu_order
                'post',       # post_type
                '',           # post_mime_type
                0,            # comment_count
            ))

            # The guid embeds the new row's id, so it can only be written
            # after the INSERT has produced that id.
            lastid = int(cursor.lastrowid)
            url_loca = 'http://localhost/wordpress/?p=' + str(lastid)
            print(url_loca)
            # NOTE(review): post_parent is set to the post's own id, exactly
            # as the original code did -- confirm this is intended.
            cursor.execute(UPDATE_POST_SQL, (lastid, url_loca, lastid))

            # Commit per article so earlier posts survive a later failure.
            db.commit()
    finally:
        cursor.close()
finally:
    db.close()

